Retaining either 3 or 6 clusters seems to have be most clear distinctions.
FA process is below.
First we calculate a correlation matrix for the tags.
practices_cor <- tags %>%
select(starts_with("practices")) %>%
cor
practices_jac <- tags %>%
select(starts_with("practices")) %>%
simil(method = "Jaccard", by_rows = FALSE)
plot_tag_cor <- function(cor_mat, title = "") {
ggcorrplot(cor_mat, hc.order = T, type = "upper") +
scale_fill_distiller(type = "div", limits = c(-1, 1), expand = c(0, 0)) +
labs(title = title,
fill = "Correlation") +
scale_x_discrete(labels = label_tags()) +
scale_y_discrete(labels = label_tags()) +
labs(x = "", y = "") +
# scale_fill_cc_gradient +
theme_transcend_sparse +
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
panel.border = element_blank(),
axis.ticks = element_blank()
)
}cor_plot <- plot_tag_cor(practices_cor,
title = "Correlation heat map for all tags") +
theme(legend.position = c(.85, .25))
ggplotly(cor_plot)jac_plot <- plot_tag_cor(as.matrix(practices_jac),
title = "Jaccard similarity for all tags") +
theme(legend.position = c(.85, .25)) +
scale_fill_distiller(type = "div", limits = c(0, 1), expand = c(0, 0))
ggplotly(jac_plot)mat_to_df <- function(m) {
data.frame(row=rownames(m)[row(m)[upper.tri(m)]],
col=colnames(m)[col(m)[upper.tri(m)]],
corr=m[upper.tri(m)])
}
d_corr <- mat_to_df(practices_cor)
d_jacc <- mat_to_df(as.matrix(practices_jac))
jacc_corr <- d_jacc %>%
rename(jaccard = corr) %>%
left_join(d_corr, by = c("row", "col")) %>%
rename(correlation = corr) %>%
mutate(
jacc_rank = rank(-jaccard),
corr_rank = rank(-correlation),
rank_diff = jacc_rank - corr_rank
) %>%
arrange(desc(jaccard))
jacc_corr %>%
datatable(
caption = "Comparison of Jaccard Similarity and Correlation Coeffficients",
colnames = c("Jaccard (0,1)" = "jaccard", "Correlation (-1,1)" = "correlation")
) %>%
formatRound(digits = 2, columns = c("Jaccard (0,1)", "Correlation (-1,1)")) corr_hist <- ggplot(jacc_corr, aes(x = correlation)) +
geom_histogram(binwidth = 0.03, fill = transcend_cols[1]
) +
geom_vline(aes(xintercept = mean(correlation)),
color = transcend_cols[3],
size = 1) +
geom_text(aes(x = mean(correlation), y = Inf,
label = paste("Average:",round(mean(correlation), 2))),
hjust = -.1, check_overlap = TRUE, vjust = 1.1,
family = "Open Sans") +
bar_y_scale_count +
scale_x_continuous(limits = c(-1, 1), expand = expansion(0, 0)) +
labs(title = "Distribution of pairwise tag correlations",
y = "Count of Tag Pairs",
x = "Correlation") +
theme(plot.margin = margin(t = 8, r = 12, b = 8, l = 8, unit = "pt"))## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
jacc_hist <- ggplot(jacc_corr, aes(x = jaccard)) +
geom_histogram(binwidth = 0.03, fill = transcend_cols[2]
) +
geom_vline(aes(xintercept = mean(jaccard)),
color = transcend_cols[3],
size = 1) +
geom_text(aes(x = mean(jaccard), y = Inf,
label = paste("Average:", round(mean(jaccard), 2))),
hjust = -.1, check_overlap = TRUE, vjust = 1.1,
family = "Open Sans") +
bar_y_scale_count +
scale_x_continuous(limits = c(0, 1), expand = expansion(0, 0)) +
labs(title = "Distribution of pairwise tag similarities",
y = "Count of Tag Pairs",
x = "Jaccard Similarity") +
theme(plot.margin = margin(t = 8, r = 12, b = 8, l = 8, unit = "pt"))
corr_hist + jacc_hist## Warning: Removed 2 rows containing missing values (`geom_bar()`).
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
fa_cor <- fa.parallel(
practices_cor, fm = "pa", fa = "fa", n.obs = nrow(tags),
main = "Correlation scree plot"
)## Parallel analysis suggests that the number of factors = 10 and the number of components = NA
jac_mat <- as.matrix(practices_jac)
jac_mat[is.na(jac_mat)] = 1
fa_jac <- fa.parallel(jac_mat, fm = "pa", fa = "fa",
n.obs = nrow(tags),
main = "Jaccard scree plot"
)## Parallel analysis suggests that the number of factors = 3 and the number of components = NA
# Parallel analysis, doesn't run but can try to figure out if interested
# n_i <- nrow(values_3) # The number of cases in our data
# n_p <- ncol(values_3) # The number of variables in our data
#
# set.seed(2) # To reproduce our randomly generated results.
#
# Eigs <- pca_3$values # The eigenvalues
# n_components <- length(Eigs) # number of components
#
# paral <- parallel(subject = n_i, # The number of cases in our data
# var = n_p, # The number of variables in our data
# rep = 1000,
# quantile = .95,
# model = "components")
#
# ParallelAna <- data.frame(Ncomponent = 1:n_components,
# Eigs,
# RandEigM = paral$eigen$mevpea,
# RandEig95= paral$eigen$qevpea)
#
# ParallelAna <- round(ParallelAna, 3)
# ParallelAna# exceeder <- ParallelAna[ParallelAna[, "RandEig95"] > ParallelAna[, "Eigs"], ][1,]
# exceederefa_3 %>%
model_parameters(sort = TRUE, threshold = "max") %>%
# write_tsv(here("output", "EFA 2023 Results Max.txt"), na = "") %>%
datatable(caption = "Max loadings") %>%
formatRound(digits = 2, columns = 2:6) efa_3 %>%
model_parameters(sort = TRUE, threshold = 0.28) %>%
# write_tsv(here("output", "EFA 2023 Results Threshold.txt"), na = "") %>%
datatable(caption = "Threshold loadings") %>%
formatRound(digits = 2, columns = 2:6) efa_3 %>%
model_parameters(sort = TRUE) %>%
# write_tsv(here("output", "EFA 2023 Results All.txt"), na = "") %>%
datatable(caption = "All loadings") %>%
formatRound(digits = 2, columns = 2:6) efa_4 <- fa(practices_cor, nfactors = 4, rotate = "oblimin", fm = "minres")
# print(efa_4, sort = T)
efa_4 %>%
model_parameters(sort = TRUE, threshold = "max") %>%
datatable(caption = "Max loadings") %>%
formatRound(digits = 2, columns = 2:7) efa_4 %>%
model_parameters(sort = TRUE, threshold = 0.28) %>%
datatable(caption = "Threshold loadings") %>%
formatRound(digits = 2, columns = 2:7) efa_4 %>%
model_parameters(sort = TRUE) %>%
datatable(caption = "All loadings") %>%
formatRound(digits = 2, columns = 2:7) efa_5 <- fa(practices_cor, nfactors = 5, rotate = "oblimin", fm = "minres")
# print(efa_5, sort = T)
efa_5 %>%
model_parameters(sort = TRUE, threshold = "max") %>%
datatable(caption = "Max loadings") %>%
formatRound(digits = 2, columns = 2:8) efa_5 %>%
model_parameters(sort = TRUE, threshold = 0.28) %>%
datatable(caption = "Threshold loadings") %>%
formatRound(digits = 2, columns = 2:8) efa_5 %>%
model_parameters(sort = TRUE) %>%
datatable(caption = "All loadings") %>%
formatRound(digits = 2, columns = 2:8) efa_6 <- fa(practices_cor, nfactors = 6, rotate = "oblimin", fm = "minres")
# print(efa_6, sort = T)
efa_6 %>%
model_parameters(sort = TRUE, threshold = "max") %>%
datatable(caption = "Max loadings") %>%
formatRound(digits = 2, columns = 2:9) efa_6 %>%
model_parameters(sort = TRUE, threshold = .28) %>%
datatable(caption = "Threshold loadings") %>%
formatRound(digits = 2, columns = 2:9) efa_6 %>%
model_parameters(sort = TRUE) %>%
datatable(caption = "All loadings") %>%
formatRound(digits = 2, columns = 2:9)